In this Take-home Exercise, I will explore the economic of the city of Engagement, Ohio USA.
Challenge 3: Economic considers the financial health of the city. Over time, are businesses growing or shrinking? How are people changing jobs? Are standards of living improving or declining over time?
Consider the financial status of Engagement’s businesses and residents, and use visual analytic techniques to address these questions.
In this take-home exercise, appropriate static and interactive statistical graphics methods are used to reveal the economic of the city of Engagement, Ohio USA while addressing the questions stated in the Task section.
The data are processed by using appropriate tidyverse family of packages and the statistical graphics are prepared using ggplot2 and its extensions.
The picture below shows a sketch of the initial design proposed.
Before we get started, it is important for us to ensure that the required R packages have been installed. If yes, we will load the R packages. If they have yet to be installed, we will install the R packages and load them onto R environment.
The chunk code below will do the trick.
packages = c('tidyverse', 'ggdist', 'ggridges', 'patchwork', 'ggthemes', 'lubridate', 'ggiraph', 'gganimate', 'plotly', 'DT', 'crosstalk')
for(p in packages){
if(!require(p, character.only = T)){
install.packages(p)
}
library(p, character.only = T)
}
library(trelliscopejs)
The code chunk below imports Participants.csv from the data
folder into R by using read_csv()
of readr
package and save it as an tibble data frame called
participants.
travel <- read_csv("rawdata/TravelJournal.csv")
employers <- read_csv("rawdata/Employers.csv")
summary(travel)
participantId travelStartTime travelStartLocationId
Min. : 0.0 Min. :2022-03-01 05:00:00 Min. : 1
1st Qu.: 221.0 1st Qu.:2022-06-10 17:35:00 1st Qu.: 449
Median : 464.0 Median :2022-10-03 18:40:00 Median : 913
Mean : 480.5 Mean :2022-10-05 05:21:39 Mean :1016
3rd Qu.: 726.0 3rd Qu.:2023-01-28 06:20:00 3rd Qu.:1358
Max. :1010.0 Max. :2023-05-24 23:35:00 Max. :1805
NA's :1043
travelEndTime travelEndLocationId purpose
Min. :2022-03-01 05:35:00 Min. : 0 Length:2099656
1st Qu.:2022-06-10 18:10:00 1st Qu.: 449 Class :character
Median :2022-10-03 19:00:00 Median : 910 Mode :character
Mean :2022-10-05 05:46:07 Mean :1015
3rd Qu.:2023-01-28 06:45:00 3rd Qu.:1358
Max. :2023-05-24 23:55:00 Max. :1805
checkInTime checkOutTime
Min. :2022-03-01 05:35:00 Min. :2022-03-01 06:00:00
1st Qu.:2022-06-10 18:10:00 1st Qu.:2022-06-10 21:40:00
Median :2022-10-03 19:00:00 Median :2022-10-03 22:47:30
Mean :2022-10-05 05:46:07 Mean :2022-10-05 09:53:15
3rd Qu.:2023-01-28 06:45:00 3rd Qu.:2023-01-28 08:30:00
Max. :2023-05-24 23:55:00 Max. :2023-05-25 00:05:00
startingBalance endingBalance
Min. : -681.6 Min. : -640.7
1st Qu.: 5077.8 1st Qu.: 5086.4
Median : 12006.9 Median : 12019.5
Mean : 19573.7 Mean : 19590.8
3rd Qu.: 25972.4 3rd Qu.: 25992.5
Max. :240494.7 Max. :240838.8
summary(employers)
employerId location buildingId
Min. : 379 Length:253 Min. : 3.0
1st Qu.: 829 Class :character 1st Qu.: 261.0
Median :1279 Mode :character Median : 486.0
Mean :1089 Mean : 517.8
3rd Qu.:1734 3rd Qu.: 782.0
Max. :1797 Max. :1041.0
travel$month <- factor(month(travel$`travelEndTime`),
levels=1:12,
labels=month.abb,
ordered=TRUE)
travel$year <- year(travel$`travelEndTime`)
travel$year_month <- format(as.Date(travel$`travelEndTime`), "%Y-%m")
travel$day <- day(travel$`travelEndTime`)
travel$date <- date(travel$`travelEndTime`)
travel$wkday <- weekdays(travel$`travelEndTime`)
First we examine the overall trend of travel for all purpose
ggplot(data=travel,
aes(x = travelStartTime,
fill = purpose)) +
geom_histogram(bins=15,
color="black") +
scale_y_continuous(NULL,
breaks = NULL)
Next we use trellis plot to examine changes over time. However since the facet function of ggplot2 is not useful for visualizing large data (i.e. 1000 participants), we will use trelliscopejs instead.
travel_count <- travel %>%
select(year_month, purpose, participantId) %>%
group_by(year_month, purpose) %>%
summarise(count = n()) %>%
ungroup()
qplot(year_month, count, data = travel_count) +
facet_wrap(~ purpose) +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
Zooming into Work pattern
travel_to_work <- travel %>%
filter(purpose == 'Work/Home Commute') %>%
inner_join(y=employers, by = c("travelEndLocationId" = "employerId")) %>%
select(participantId, travelEndTime, year, year_month, month, day, date, wkday, travelEndLocationId, purpose, location, buildingId) %>%
rename('employerId' = 'travelEndLocationId')
Rename a value for simplicity
travel_to_work$purpose <- sub('Work/Home Commute',
'Work',
travel_to_work$purpose)
doing a count:
travel_to_work_count <- travel_to_work %>%
group_by(year_month, day) %>%
summarise(count = n()) %>%
ungroup()
by month analysis
ggplot() +
geom_line(data=travel_to_work_count,
aes(x=day,
y=count,
group=year_month),
colour="black") +
facet_grid(~year_month) +
labs(axis.text.x = element_blank()) +
xlab("") +
ylab("No. of Visitors")
qplot(day, count, data = travel_to_work_count) +
facet_wrap(~ year_month) +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
qplot(day, count, data = travel_to_work_count) +
facet_trelliscope(~ year_month, nrow = 2, ncol = 4, width = 600,
path = "trellis/",
self_contained=TRUE) +
theme(axis.text.x=element_blank(), axis.ticks.x=element_blank())
Interactive Bar graph by month
travel_to_work_by_month <- travel_to_work %>%
group_by(year_month, purpose) %>%
summarise(count = n()) %>%
ungroup()
travel_to_work_by_month$tooltip <- c(paste0(
"Purpose = ", travel_to_work_by_month$purpose,
"\n Count = ", travel_to_work_by_month$count))
p <- ggplot(data=travel_to_work_by_month,
aes(x = year_month, y = count)) +
geom_bar_interactive(aes(tooltip = travel_to_work_by_month$tooltip,
data_id = year_month),
stat="identity")
girafe(
ggobj = p,
width_svg = 12,
height_svg = 12*0.618
)
travel_to_work_daily_count <- travel_to_work %>%
group_by(employerId, year_month, day, date, wkday) %>%
summarise(count = n()) %>%
ungroup()
travel_to_work_monthly_change <- travel_to_work_daily_count %>%
group_by(employerId, year_month) %>%
summarise(monthly_employees = max(count)) %>%
mutate(mom_change = coalesce(monthly_employees - lead(monthly_employees),0),
mom_turnover_rate = coalesce((monthly_employees - lead(monthly_employees))/monthly_employees,0)) %>%
ungroup()
travel_to_work_mom <- travel_to_work_monthly_change %>%
group_by(year_month) %>%
summarise(avg_turnover = mean(mom_turnover_rate)) %>%
ungroup()
travel_to_work_mom$tooltip <- c(paste0(
"MOM Turnover % =", round(travel_to_work_mom$avg_turnover*100,1), '%'))
p2 <- ggplot(data=travel_to_work_mom,
aes(x = year_month, y = avg_turnover)) +
geom_bar_interactive(aes(tooltip = travel_to_work_mom$tooltip,
data_id = year_month),
stat="identity")
girafe(code = print(p / p2),
width_svg = 12,
height_svg = 12,
options = list(
opts_hover(css = "fill: #202020;"),
opts_hover_inv(css = "opacity:0.2;")
)
)
Now look into by wkday
wkday_levels <- c('Saturday', 'Friday',
'Thursday', 'Wednesday',
'Tuesday', 'Monday',
'Sunday')
travel_to_work_by_day <- travel_to_work_daily_count %>%
group_by(date, day, wkday, year_month) %>%
summarise(daily_employees = sum(count)) %>%
ungroup() %>%
mutate(wkday = factor(
wkday, levels = wkday_levels))
p3 <- ggplot(travel_to_work_by_day,
aes(year_month,
wkday,
fill = daily_employees)) +
geom_tile(color = "white",
size = 0.1) +
theme_tufte(base_family = "Helvetica") +
coord_equal() +
scale_fill_gradient(name = "# of attacks",
low = "sky blue",
high = "dark blue")
ggplotly(p3)
d <- highlight_key(travel_to_work_by_day)
p4 <- ggplot(d, aes(date, daily_employees)) +
geom_line()
gg <- highlight(ggplotly(p4),
"plotly_selected")
crosstalk::bscols(gg,
DT::datatable(d))